import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# pd.set_option("display.max_rows", None)  # enable to also show every row

# Load the visa applications dataset and preview its first rows.
data = pd.read_csv("Visadataset.csv")
data.head()
| case_id | continent | education_of_employee | has_job_experience | requires_job_training | no_of_employees | yr_of_estab | region_of_employment | prevailing_wage | unit_of_wage | full_time_position | case_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | EZYV01 | Asia | High School | N | N | 14513 | 2007 | West | 592.2029 | Hour | Y | Denied |
| 1 | EZYV02 | Asia | Master's | Y | N | 2412 | 2002 | Northeast | 83425.6500 | Year | Y | Certified |
| 2 | EZYV03 | Asia | Bachelor's | N | Y | 44444 | 2008 | West | 122996.8600 | Year | Y | Denied |
| 3 | EZYV04 | Asia | Bachelor's | N | N | 98 | 1897 | West | 83434.0300 | Year | Y | Denied |
| 4 | EZYV05 | Africa | Master's | Y | N | 1082 | 2005 | South | 149907.3900 | Year | Y | Certified |
data.shape
(25480, 12)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 25480 entries, 0 to 25479 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 case_id 25480 non-null object 1 continent 25480 non-null object 2 education_of_employee 25480 non-null object 3 has_job_experience 25480 non-null object 4 requires_job_training 25480 non-null object 5 no_of_employees 25480 non-null int64 6 yr_of_estab 25480 non-null int64 7 region_of_employment 25480 non-null object 8 prevailing_wage 25480 non-null float64 9 unit_of_wage 25480 non-null object 10 full_time_position 25480 non-null object 11 case_status 25480 non-null object dtypes: float64(1), int64(2), object(9) memory usage: 2.3+ MB
# Display summary statistics for a dataframe
data.describe()
| no_of_employees | yr_of_estab | prevailing_wage | |
|---|---|---|---|
| count | 25480.000000 | 25480.000000 | 25480.000000 |
| mean | 5667.043210 | 1979.409929 | 74455.814592 |
| std | 22877.928848 | 42.366929 | 52815.942327 |
| min | -26.000000 | 1800.000000 | 2.136700 |
| 25% | 1022.000000 | 1976.000000 | 34015.480000 |
| 50% | 2109.000000 | 1997.000000 | 70308.210000 |
| 75% | 3504.000000 | 2005.000000 | 107735.512500 |
| max | 602069.000000 | 2016.000000 | 319210.270000 |
# Every column whose dtype is not `object` is treated as numerical.
numeric_features = [
    column for column in data.columns if data[column].dtype != 'O'
]
print(f'we have {len(numeric_features)} numerical features : {numeric_features}')
we have 3 numerical features : ['no_of_employees', 'yr_of_estab', 'prevailing_wage']
# Every column stored with the `object` dtype is treated as categorical.
categoric_features = [
    column for column in data.columns if data[column].dtype == 'O'
]
print(f'we have {len(categoric_features)} categorical features : {categoric_features}')
we have 9 categorical features : ['case_id', 'continent', 'education_of_employee', 'has_job_experience', 'requires_job_training', 'region_of_employment', 'unit_of_wage', 'full_time_position', 'case_status']
# Percentage share of each category, printed per categorical column and
# followed by a separator line.
for col in categoric_features:
    print(data[col].value_counts(normalize=True) * 100)
    print('---------------------------')
EZYV7632 0.003925
EZYV1968 0.003925
EZYV2434 0.003925
EZYV4917 0.003925
EZYV10934 0.003925
...
EZYV6648 0.003925
EZYV17020 0.003925
EZYV20231 0.003925
EZYV17436 0.003925
EZYV13891 0.003925
Name: case_id, Length: 25480, dtype: float64
---------------------------
Asia 66.173469
Europe 14.646782
North America 12.919937
South America 3.343799
Africa 2.162480
Oceania 0.753532
Name: continent, dtype: float64
---------------------------
Bachelor's 40.164835
Master's 37.810047
High School 13.422292
Doctorate 8.602826
Name: education_of_employee, dtype: float64
---------------------------
Y 58.092622
N 41.907378
Name: has_job_experience, dtype: float64
---------------------------
N 88.402669
Y 11.597331
Name: requires_job_training, dtype: float64
---------------------------
Northeast 28.237834
South 27.539246
West 25.847724
Midwest 16.903454
Island 1.471743
Name: region_of_employment, dtype: float64
---------------------------
Year 90.117739
Hour 8.465463
Week 1.067504
Month 0.349294
Name: unit_of_wage, dtype: float64
---------------------------
Y 89.375981
N 10.624019
Name: full_time_position, dtype: float64
---------------------------
Certified 66.789639
Denied 33.210361
Name: case_status, dtype: float64
---------------------------
Univariate analysis is the analysis of a single variable (the prefix “uni” means “one”). Its purpose is to understand the distribution of values of that one variable.
# One KDE panel per numerical feature, placed on a 2x2 grid.
plt.figure(figsize=(15, 10))
plt.suptitle('Univariate Analysis of Numerical Features', fontsize=20, fontweight='bold', alpha=0.8, y=1.)
for i, feature in enumerate(numeric_features):
    plt.subplot(2, 2, i + 1)
    sns.kdeplot(x=data[feature], color='blue')
    plt.xlabel(feature)
# Adjust spacing once, after every panel has been drawn.
plt.tight_layout()
# categorical columns
# case_id is dropped from the list before plotting. The membership check keeps
# this cell re-runnable: an unconditional remove() raises ValueError the
# second time it is executed.
if 'case_id' in categoric_features:
    categoric_features.remove('case_id')

# One count plot per remaining categorical feature, placed on a 4x2 grid.
plt.figure(figsize=(20, 20))
plt.suptitle('Univariate Analysis of Categorical Features', fontsize=20, fontweight='bold', alpha=0.8, y=1.)
for i, feature in enumerate(categoric_features):
    plt.subplot(4, 2, i + 1)
    sns.countplot(x=data[feature], ec="black", palette='Set1')
    plt.xlabel(feature)
# Adjust spacing once, after every panel has been drawn.
plt.tight_layout()
Multivariate analysis is the analysis of more than one variable.
# Pairwise correlation of the numeric columns only. Selecting them explicitly
# is required on pandas >= 2.0, where corr() no longer silently drops string
# columns and raises instead.
data.select_dtypes(include='number').corr()
| no_of_employees | yr_of_estab | prevailing_wage | |
|---|---|---|---|
| no_of_employees | 1.000000 | -0.017770 | -0.009523 |
| yr_of_estab | -0.017770 | 1.000000 | 0.012342 |
| prevailing_wage | -0.009523 | 0.012342 | 1.000000 |
df1 = data.copy()
plt.figure(figsize=(15, 10))
# Restrict the heatmap to numeric columns: on pandas >= 2.0, data.corr() raises
# when the frame still contains string columns.
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True, cmap="viridis")
<AxesSubplot:>
The pairwise correlations between the numerical features are all close to zero, so there is no multicollinearity among them.
from scipy.stats import chi2_contingency

# Chi-square test of independence between the target (case_status) and each
# categorical feature, judged at the 5% significance level.
# NOTE(review): categoric_features still contains 'case_status' itself, and
# testing the target against itself rejects trivially; consider excluding it.
chi2_test = []
for feature in categoric_features:
    contingency = pd.crosstab(data['case_status'], data[feature])
    p_value = chi2_contingency(contingency)[1]  # index 1 is the p-value
    if p_value < 0.05:
        chi2_test.append('Reject Null Hypothesis')
    else:
        chi2_test.append('Fail to Reject Null Hypothesis')

# Tabulate the decision next to the feature it belongs to.
result = pd.DataFrame(data=[categoric_features, chi2_test]).T
result.columns = ['Column', 'Hypothesis Result']
result
| Column | Hypothesis Result | |
|---|---|---|
| 0 | continent | Reject Null Hypothesis |
| 1 | education_of_employee | Reject Null Hypothesis |
| 2 | has_job_experience | Reject Null Hypothesis |
| 3 | requires_job_training | Fail to Reject Null Hypothesis |
| 4 | region_of_employment | Reject Null Hypothesis |
| 5 | unit_of_wage | Reject Null Hypothesis |
| 6 | full_time_position | Reject Null Hypothesis |
| 7 | case_status | Reject Null Hypothesis |
Here requires_job_training fails to reject the null hypothesis, which means there is no statistically significant association between it and the target column (case_status) at the 5% level.
data.isnull().sum()
case_id 0 continent 0 education_of_employee 0 has_job_experience 0 requires_job_training 0 no_of_employees 0 yr_of_estab 0 region_of_employment 0 prevailing_wage 0 unit_of_wage 0 full_time_position 0 case_status 0 dtype: int64
There are no missing values.
# Keep the numerical columns that take at least 10 distinct values.
continues_features = [
    column for column in numeric_features if len(data[column].unique()) >= 10
]
print(f'we have {len(continues_features)} numerical features:{continues_features}')
we have 3 numerical features:['no_of_employees', 'yr_of_estab', 'prevailing_wage']
clr1 = ['#1E90FF', '#DC143C']
# One row per continuous feature: box plot on the left, stacked histogram on
# the right. The row count follows the feature list instead of a fixed 3, and
# squeeze=False keeps `ax` two-dimensional even for a single feature.
fig, ax = plt.subplots(len(continues_features), 2, figsize=(10, 12), squeeze=False)
fig.suptitle('Distribution of Numerical Features By Case Status', color='#3C3744',
             fontsize=20, fontweight='bold', ha='center')
for i, col in enumerate(continues_features):
    sns.boxplot(data=data, x='case_status', y=col, palette=clr1, ax=ax[i, 0])
    ax[i, 0].set_title(f'Boxplot of {col}', fontsize=12)
    sns.histplot(data=data, x=col, hue='case_status', bins=20, kde=True,
                 multiple='stack', palette=clr1, ax=ax[i, 1])
    ax[i, 1].set_title(f'Histogram of {col}', fontsize=14)
fig.tight_layout()
# Leave headroom so the figure title does not overlap the top row.
fig.subplots_adjust(top=0.90)
# Share of each case_status class, in percent.
status_fraction = data['case_status'].value_counts(normalize=True)
values = status_fraction * 100
values
Certified 66.789639 Denied 33.210361 Name: case_status, dtype: float64
import plotly.graph_objects as go
# Donut chart of the target distribution. Plotly renders its own figure, so no
# matplotlib figure is created here (doing so only produced an empty figure).
# Labels are read from `values` itself so they always match the percentages.
labels = list(values.index)
fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3, pull=(0, 0.1),
                             title="pie chart for case_status i.e. Target Variable",
                             titleposition="top center")])
fig.show()
From the chart it is seen that the Target Variable is Imbalanced
Imbalanced data is data in which the target classes have an uneven distribution of observations; here the Certified class (about 67%) has roughly twice as many observations as the Denied class (about 33%).
# Certified / Denied split within each continent, as percentages.
status_by_continent = data.groupby(['continent'])['case_status'].value_counts(normalize=True)
continent = status_by_continent.to_frame() * 100
continent
| case_status | ||
|---|---|---|
| continent | case_status | |
| Africa | Certified | 72.050817 |
| Denied | 27.949183 | |
| Asia | Certified | 65.310480 |
| Denied | 34.689520 | |
| Europe | Certified | 79.233655 |
| Denied | 20.766345 | |
| North America | Certified | 61.877278 |
| Denied | 38.122722 | |
| Oceania | Certified | 63.541667 |
| Denied | 36.458333 | |
| South America | Certified | 57.863850 |
| Denied | 42.136150 |
# Application counts per continent, split by visa decision.
plt.figure(figsize=(13, 10))
# `fontweight` and `weight` are aliases of the same text property, so the
# weight is passed once instead of twice with conflicting values.
plt.suptitle('Continent vs case_status', fontsize='large', fontweight='extra bold')
sns.countplot(x="continent", hue="case_status", data=data, palette="gist_rainbow", ec="black")
plt.xlabel("Continent", fontweight='extra bold')
plt.ylabel("Count", fontweight='extra bold')
plt.legend(title="Visa Status", fancybox=True)
plt.show()
df2 = data.copy()
plt.figure(figsize=[14, 7])
# Percentage of applications per continent that were certified. Computed once
# and reused for both the bars and their labels; a category with no certified
# case counts as 0%.
certified_counts = df2.loc[df2["case_status"] == 'Certified', 'continent'].value_counts()
order1 = (100 * certified_counts / df2['continent'].value_counts()).fillna(0)
order1.plot(kind='bar', colormap='Accent')
plt.title("Percentage of Certified", fontsize=15, fontweight='bold')
# Iterate over the values by position: integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
for n, count in enumerate(order1):
    plt.text(n, count + 0.1, '{:0.1f}%'.format(count), ha='center')
plt.xlabel('Continent', fontweight='bold')
plt.xticks(rotation=0)
plt.show()
data.head()
| case_id | continent | education_of_employee | has_job_experience | requires_job_training | no_of_employees | yr_of_estab | region_of_employment | prevailing_wage | unit_of_wage | full_time_position | case_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | EZYV01 | Asia | High School | N | N | 14513 | 2007 | West | 592.2029 | Hour | Y | Denied |
| 1 | EZYV02 | Asia | Master's | Y | N | 2412 | 2002 | Northeast | 83425.6500 | Year | Y | Certified |
| 2 | EZYV03 | Asia | Bachelor's | N | Y | 44444 | 2008 | West | 122996.8600 | Year | Y | Denied |
| 3 | EZYV04 | Asia | Bachelor's | N | N | 98 | 1897 | West | 83434.0300 | Year | Y | Denied |
| 4 | EZYV05 | Africa | Master's | Y | N | 1082 | 2005 | South | 149907.3900 | Year | Y | Certified |
data.groupby(['education_of_employee'])['case_status'].value_counts(normalize=True).to_frame()*100
| case_status | ||
|---|---|---|
| education_of_employee | case_status | |
| Bachelor's | Certified | 62.214188 |
| Denied | 37.785812 | |
| Doctorate | Certified | 87.226277 |
| Denied | 12.773723 | |
| High School | Denied | 65.964912 |
| Certified | 34.035088 | |
| Master's | Certified | 78.627777 |
| Denied | 21.372223 |
# Application counts per education level, split by visa decision.
plt.figure(figsize=(10, 8))
# `fontweight` and `weight` are aliases of the same text property, so the
# weight is passed once instead of twice with conflicting values.
plt.title("Education vs case_status", fontsize='large', fontweight='extra bold')
sns.countplot(x="education_of_employee", data=data, hue='case_status', ec='black', palette="gist_rainbow")
plt.xlabel("Education Employee", fontsize='large', weight=20)
plt.ylabel("Count", fontsize='large', weight=20)
plt.legend(title="Visa Status", fancybox=True)
plt.xticks(rotation=0)
plt.show()
plt.figure(figsize=[14, 7])
# Percentage of applications per education level that were certified. Computed
# once and reused for both the bars and their labels; a category with no
# certified case counts as 0%.
certified_counts = df2.loc[df2["case_status"] == 'Certified', 'education_of_employee'].value_counts()
order1 = (100 * certified_counts / df2['education_of_employee'].value_counts()).fillna(0)
order1.plot(kind='bar', colormap='Accent')
plt.title("Percentage of Certified", fontsize=15, fontweight='bold')
# Iterate over the values by position: integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
for n, count in enumerate(order1):
    plt.text(n, count + 0.1, '{:0.1f}%'.format(count), ha='center')
plt.xlabel('Education Employee', fontweight='bold')
plt.xticks(rotation=0)
plt.show()
df2.head()
| case_id | continent | education_of_employee | has_job_experience | requires_job_training | no_of_employees | yr_of_estab | region_of_employment | prevailing_wage | unit_of_wage | full_time_position | case_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | EZYV01 | Asia | High School | N | N | 14513 | 2007 | West | 592.2029 | Hour | Y | Denied |
| 1 | EZYV02 | Asia | Master's | Y | N | 2412 | 2002 | Northeast | 83425.6500 | Year | Y | Certified |
| 2 | EZYV03 | Asia | Bachelor's | N | Y | 44444 | 2008 | West | 122996.8600 | Year | Y | Denied |
| 3 | EZYV04 | Asia | Bachelor's | N | N | 98 | 1897 | West | 83434.0300 | Year | Y | Denied |
| 4 | EZYV05 | Africa | Master's | Y | N | 1082 | 2005 | South | 149907.3900 | Year | Y | Certified |
df2.groupby('has_job_experience')['case_status'].value_counts(normalize=True).to_frame()*100
| case_status | ||
|---|---|---|
| has_job_experience | case_status | |
| N | Certified | 56.134108 |
| Denied | 43.865892 | |
| Y | Certified | 74.476422 |
| Denied | 25.523578 |
# Application counts by prior job experience, split by visa decision.
plt.figure(figsize=(10, 8))
# `fontweight` and `weight` are aliases of the same text property, so the
# weight is passed once instead of twice with conflicting values.
plt.title("Previous Work Experience of Employee vs case_status", fontsize='large', fontweight='extra bold')
sns.countplot(x="has_job_experience", data=data, hue='case_status', ec='black', palette="gist_rainbow")
plt.xlabel("Work Experience of Employee ", fontsize='large', weight=20)
plt.ylabel("Count", fontsize='large', weight=20)
plt.legend(title="Visa Status", fancybox=True)
plt.xticks(rotation=0)
plt.show()
plt.figure(figsize=[14, 7])
# Percentage of applications per job-experience flag that were certified.
# Computed once and reused for both the bars and their labels; a category with
# no certified case counts as 0%.
certified_counts = df2.loc[df2["case_status"] == 'Certified', 'has_job_experience'].value_counts()
order1 = (100 * certified_counts / df2['has_job_experience'].value_counts()).fillna(0)
order1.plot(kind='bar', colormap='Accent')
plt.title("Percentage of Certified vs Job Experience", fontsize=15, fontweight='bold')
# Iterate over the values by position: integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
for n, count in enumerate(order1):
    plt.text(n, count + 0.1, '{:0.1f}%'.format(count), ha='center')
plt.xlabel('has_job_experience', fontweight='bold')
plt.xticks(rotation=0)
plt.show()
df2.head()
| case_id | continent | education_of_employee | has_job_experience | requires_job_training | no_of_employees | yr_of_estab | region_of_employment | prevailing_wage | unit_of_wage | full_time_position | case_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | EZYV01 | Asia | High School | N | N | 14513 | 2007 | West | 592.2029 | Hour | Y | Denied |
| 1 | EZYV02 | Asia | Master's | Y | N | 2412 | 2002 | Northeast | 83425.6500 | Year | Y | Certified |
| 2 | EZYV03 | Asia | Bachelor's | N | Y | 44444 | 2008 | West | 122996.8600 | Year | Y | Denied |
| 3 | EZYV04 | Asia | Bachelor's | N | N | 98 | 1897 | West | 83434.0300 | Year | Y | Denied |
| 4 | EZYV05 | Africa | Master's | Y | N | 1082 | 2005 | South | 149907.3900 | Year | Y | Certified |
df2.groupby("requires_job_training")['case_status'].value_counts(normalize=True).to_frame()*100
| case_status | ||
|---|---|---|
| requires_job_training | case_status | |
| N | Certified | 66.645949 |
| Denied | 33.354051 | |
| Y | Certified | 67.884941 |
| Denied | 32.115059 |
# Application counts by job-training requirement, split by visa decision.
# The previous cell drew a violin plot of no_of_employees here (a copy of the
# plot used for that feature later on) while labelling the x-axis as the
# job-training requirement; this plots the feature the labels describe.
plt.subplots(figsize=(10, 13))
sns.countplot(x="requires_job_training", hue="case_status", data=data, ec="black", palette="Accent")
plt.title("Job Training Requirement vs Visa Status", weight="bold", fontsize=20, pad=20)
plt.xlabel("Requires Job Training for Employee", weight="bold", fontsize=16)
plt.ylabel("Count", weight="bold", fontsize=12)
plt.legend(title="Visa Status", fancybox=True)
plt.show()
plt.figure(figsize=[14, 7])
# Percentage of applications per job-training flag that were certified.
# Computed once and reused for both the bars and their labels; a category with
# no certified case counts as 0%.
certified_counts = df2.loc[df2["case_status"] == 'Certified', 'requires_job_training'].value_counts()
order1 = (100 * certified_counts / df2['requires_job_training'].value_counts()).fillna(0)
order1.plot(kind='bar', colormap='Accent')
plt.title("Percentage of Certified vs Required training", fontsize=15, fontweight='bold')
# Iterate over the values by position: integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
for n, count in enumerate(order1):
    plt.text(n, count + 0.1, '{:0.1f}%'.format(count), ha='center')
plt.xlabel('requires_job_training', fontweight='bold')
plt.xticks(rotation=0)
plt.show()
df2.head()
| case_id | continent | education_of_employee | has_job_experience | requires_job_training | no_of_employees | yr_of_estab | region_of_employment | prevailing_wage | unit_of_wage | full_time_position | case_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | EZYV01 | Asia | High School | N | N | 14513 | 2007 | West | 592.2029 | Hour | Y | Denied |
| 1 | EZYV02 | Asia | Master's | Y | N | 2412 | 2002 | Northeast | 83425.6500 | Year | Y | Certified |
| 2 | EZYV03 | Asia | Bachelor's | N | Y | 44444 | 2008 | West | 122996.8600 | Year | Y | Denied |
| 3 | EZYV04 | Asia | Bachelor's | N | N | 98 | 1897 | West | 83434.0300 | Year | Y | Denied |
| 4 | EZYV05 | Africa | Master's | Y | N | 1082 | 2005 | South | 149907.3900 | Year | Y | Certified |
# Distribution of company size (no_of_employees) for each visa decision.
plt.subplots(figsize=(10, 13))
sns.violinplot(x="case_status", y='no_of_employees', data=data, palette="Accent")
plt.title("Number of employees vs Visa Status", weight="bold", fontsize=20, pad=20)
# Axis labels name the plotted variables: case_status on x, no_of_employees on y.
plt.xlabel("Visa Status", weight="bold", fontsize=16)
plt.ylabel("Number of Employees", weight="bold", fontsize=12)
plt.show()
Insights
The distribution of both classes is similar, but there are outliers in both classes which need to be handled.
df2.head()
| case_id | continent | education_of_employee | has_job_experience | requires_job_training | no_of_employees | yr_of_estab | region_of_employment | prevailing_wage | unit_of_wage | full_time_position | case_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | EZYV01 | Asia | High School | N | N | 14513 | 2007 | West | 592.2029 | Hour | Y | Denied |
| 1 | EZYV02 | Asia | Master's | Y | N | 2412 | 2002 | Northeast | 83425.6500 | Year | Y | Certified |
| 2 | EZYV03 | Asia | Bachelor's | N | Y | 44444 | 2008 | West | 122996.8600 | Year | Y | Denied |
| 3 | EZYV04 | Asia | Bachelor's | N | N | 98 | 1897 | West | 83434.0300 | Year | Y | Denied |
| 4 | EZYV05 | Africa | Master's | Y | N | 1082 | 2005 | South | 149907.3900 | Year | Y | Certified |
# Certified / Denied split within each wage unit, as percentages.
data.groupby('unit_of_wage')["case_status"].value_counts(normalize=True).to_frame()*100

# Application counts per wage unit, split by visa decision. Title and x-label
# now name the plotted column instead of the unrelated "No of Employees".
plt.subplots(figsize=(14, 7))
sns.countplot(x="unit_of_wage", hue="case_status", data=data, ec='k', palette='Accent')
plt.title("Unit of Wage vs Visa Status", weight="bold", fontsize=20, pad=20)
plt.xlabel("Unit of Wage", weight="bold", fontsize=16)
plt.ylabel("Count", weight="bold", fontsize=12)
plt.show()
plt.figure(figsize=[14, 7])
# Percentage of applications per wage unit that were certified. Computed once
# and reused for both the bars and their labels; a category with no certified
# case counts as 0%.
certified_counts = df2.loc[df2["case_status"] == 'Certified', 'unit_of_wage'].value_counts()
order1 = (100 * certified_counts / df2['unit_of_wage'].value_counts()).fillna(0)
order1.plot(kind='bar', colormap='Accent')
plt.title("Percentage of Certified vs contract time", fontsize=15, fontweight='bold')
# Iterate over the values by position: integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
for n, count in enumerate(order1):
    plt.text(n, count + 0.1, '{:0.1f}%'.format(count), ha='center')
plt.xlabel('unit_of_wage', fontweight='bold')
plt.xticks(rotation=0)
plt.show()
df2.head()
# Certified / Denied split within each employment region, as percentages.
status_by_region = data.groupby(['region_of_employment'])['case_status'].value_counts(normalize=True)
status_by_region.to_frame() * 100

# Application counts per employment region, split by visa decision.
plt.figure(figsize=(10, 10))
sns.countplot(x='region_of_employment', hue='case_status', data=data, ec='black')
plt.title('Region of Employement', weight='bold', fontsize=20, pad=20)
plt.xlabel('Region of employment', weight='bold', fontsize=10)
plt.ylabel('Count', weight='bold')
plt.show()
plt.figure(figsize=[14, 7])
# Percentage of applications per employment region that were certified.
# Computed once and reused for both the bars and their labels; a category with
# no certified case counts as 0%.
certified_counts = df2.loc[df2["case_status"] == 'Certified', 'region_of_employment'].value_counts()
order1 = (100 * certified_counts / df2['region_of_employment'].value_counts()).fillna(0)
order1.plot(kind='bar', colormap='Accent')
plt.title("Percentage of Certified vs employment region", fontsize=15, fontweight='bold')
# Iterate over the values by position: integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
for n, count in enumerate(order1):
    plt.text(n, count + 0.1, '{:0.1f}%'.format(count), ha='center')
plt.xlabel('region_of_employment', fontweight='bold')
plt.xticks(rotation=0)
plt.show()
df2.head()
# Count of each case_status per distinct prevailing_wage value.
# NOTE(review): prevailing_wage is a float column, so this produces one group
# per distinct wage; a binned summary may be more informative. Confirm intent.
data.groupby(['prevailing_wage'])['case_status'].value_counts().to_frame()
# Histogram of prevailing_wage, coloured by visa decision.
# NOTE(review): wages appear to be recorded in mixed units (see unit_of_wage);
# confirm whether they should be normalised before being compared.
plt.figure(figsize=(10,7))
plt.title('prevailing_wage vs Visa_status',weight='bold',fontsize=20,pad=20)
sns.histplot(x='prevailing_wage',hue="case_status",data=data)
plt.xlabel("prevailing_wage", weight="bold", fontsize=16)
plt.ylabel("Count", weight="bold", fontsize=12)
plt.show()
df2.head()
# Mean prevailing wage per education level, highest first.
data.groupby(['education_of_employee'])['prevailing_wage'].mean().to_frame().sort_values(by='prevailing_wage',ascending=False)

# Prevailing-wage distribution for each education level. The title now names
# the plotted variables instead of the unrelated "Region of Employment".
plt.subplots(figsize=(14, 7))
sns.boxplot(y="education_of_employee", x="prevailing_wage", data=data, palette='bright')
plt.title("Education vs Prevailing Wage", weight="bold", fontsize=20, pad=20)
plt.xlabel("Prevailing Wage", weight="bold", fontsize=16)
plt.ylabel("Education", weight="bold", fontsize=12)
plt.show()
# Histogram of the company's year of establishment, coloured by visa decision.
# Title and axis labels now describe yr_of_estab and its counts; they were
# copied from the wage/education plots.
plt.subplots(figsize=(14, 7))
sns.histplot(x="yr_of_estab", data=data, palette='gnuplot', bins=40, hue='case_status')
plt.title("Year of Establishment vs Visa Status", weight="bold", fontsize=20, pad=20)
plt.xlabel("Year of Establishment", weight="bold", fontsize=16)
plt.ylabel("Count", weight="bold", fontsize=12)
plt.show()
df2.head()
| case_id | continent | education_of_employee | has_job_experience | requires_job_training | no_of_employees | yr_of_estab | region_of_employment | prevailing_wage | unit_of_wage | full_time_position | case_status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | EZYV01 | Asia | High School | N | N | 14513 | 2007 | West | 592.2029 | Hour | Y | Denied |
| 1 | EZYV02 | Asia | Master's | Y | N | 2412 | 2002 | Northeast | 83425.6500 | Year | Y | Certified |
| 2 | EZYV03 | Asia | Bachelor's | N | Y | 44444 | 2008 | West | 122996.8600 | Year | Y | Denied |
| 3 | EZYV04 | Asia | Bachelor's | N | N | 98 | 1897 | West | 83434.0300 | Year | Y | Denied |
| 4 | EZYV05 | Africa | Master's | Y | N | 1082 | 2005 | South | 149907.3900 | Year | Y | Certified |
# Columns that contain at least one null entry.
missing_values = [
    column for column in data.columns if data[column].isnull().any()
]
missing_values
[]
There are no missing values.